# -*- coding: utf-8 -*-
import scrapy
from noticelist.items import NoticelistItem


class AliyunSpider(scrapy.Spider):
    """Crawl Aliyun's notice list (announcement) pages and yield one
    NoticelistItem per notice detail page.

    Pagination walks list pages 2..10 at
    ``https://help.aliyun.com/notice_list_page/9213612/<n>.html``.
    """

    name = 'noticelist'
    allowed_domains = ['aliyun.com']
    start_urls = ['https://help.aliyun.com/noticelist/9213612.html?spm=a2c4g.11174386.n2.2.40be1051mYX59f']
    # Page counter shared across parse() calls; pagination stops before page 11.
    # NOTE(review): mutable class-level state — works because list pages are
    # requested one at a time, but would double-count under concurrent parses.
    nextPage = 1

    def parse(self, response):
        """Parse one list page: yield a Request per notice detail link,
        then a Request for the next list page (up to page 10).
        """
        main_url = 'https://help.aliyun.com'
        # Each <li class="... y-clear ..."> wraps a link to one notice.
        for box in response.xpath('//li[contains(@class,"y-clear")]'):
            purl = box.xpath('a/@href').extract_first()
            if purl:
                # Build the absolute detail-page URL from the relative href.
                page = main_url + purl
                self.log(page)
                yield scrapy.Request(page, callback=self.parse_detail)
        # Follow pagination. (Original guarded with `if url:` on a literal
        # string — always true, removed as dead code.)
        self.nextPage = self.nextPage + 1
        if self.nextPage < 11:
            next_url = 'https://help.aliyun.com/notice_list_page/9213612/{}.html'.format(self.nextPage)
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        """Parse a single notice detail page into a NoticelistItem."""
        item = NoticelistItem()
        # Notice title and full announcement body (outer HTML of the
        # knowledge container).
        item['title'] = response.xpath('//h3/text()').extract_first()
        item['mainBody'] = response.xpath('//*[@id="se-knowledge"]').extract_first()
        for para in response.xpath('//*[@id="se-knowledge"]/p'):
            self.log(para.xpath('string()').extract_first())
        # extract_first() may return None when the container is missing;
        # guard the concatenation to avoid a TypeError.
        self.log('startA:  ' + (item['mainBody'] or ''))
        yield item
